---
title: "Whose ideas are worth spreading? The representation of women and ethnic groups in TED talks"
author: "Carsten Schwemmer"
date: "July 12, 2019"
output:
  html_document: 
    highlight: tango
    toc: true
    toc_depth: 3
---

# Replication material 

## Setup

```{r setup, include = FALSE}
#adjust your working directory here
knitr::opts_knit$set(root.dir = '/home/cs/Dropbox/kasus/PhD/projects/ted/replication')
```


```{r message=FALSE, warning=FALSE}
set.seed(1337)
library(quanteda)
library(stm)
library(reshape2)
library(gridExtra)
library(lubridate)
library(grid)
library(textmineR)
library(cowplot)
library(stminsights)
library(sentimentr)
library(ggridges)
library(ggrepel)
library(ggeffects)
library(ggdendro)
library(GGally)
library(MASS)
library(cowplot)
library(sjPlot)
library(sjmisc)
library(irr)
library(caret)
library(wru)
library(genderizeR)
library(stargazer)
library(scales)
library(psych)
library(texreg)
library(arm)
library(tidyverse)
library(AER)
theme_set(theme_light())

options(scipen=999)
#colors <- brewer_pal(palette = 'Set1')(6)

df <- read_tsv('ted_main_dataset.tsv')
# remove talks without human speakers
df <- df %>% filter(!is.na(speaker_image_nr_faces))

df <- df %>% arrange(date) # order by date
df$year <-str_sub(df$date, 1, 4) # extract year identifier
df$date_num <- factor(df$date) %>% as.numeric() # create numeric date variable
```


This document includes analysis for a dataset of TED talk transcripts  which were extracted from the [TED Homepage](https://www.ted.com/talks). A small number of talks had to be removed because no (english) transcripts were available or the talks solely consisted of non-verbal activities. The dataset contains the following information:

- ``title``: title of the talk
- ``speaker``: name of the Speaker
- ``date``: month and year of the talk
- ``text``: transcript of the talk
- ``rated1``: first categorization of talk (e.g. "inspiring")
- ``rated2``: second categorization of talk (e.g. "funny")
- ``length``: number of characters for the talk transcript
- ``dummy_female``: a dummy variable for speaker gender (female vs male)
- ``dummy_female``: a dummy variable for speaker ethnicity (white vs non-white)
- ``speaker_*``: several attributes on the speaker level, including info about speakers on the TED website as well as annotation from the image recognition service
- ``yt_*``: several attributes for YouTube info on the level of TED talks, including video id, video url, and comment sentiment.



```{r }
head(df, 5)
```

## Descriptive Representation

### Gender

```{r}

df_year_gender <- df %>%
  group_by(year, dummy_female) %>%
  summarise(n = n()) %>%
  na.omit() %>%
  mutate(freq = (n / sum(n))) %>%
  ungroup()
df_year_gender$female <-  factor(df_year_gender$dummy_female,
                                 labels = c('Male', 'Female'))

cols_gender <- c('Male' = '#778899', 'Female' = '#377eb8')

p_gender <-
  df_year_gender %>% ggplot(aes(
    x = year,
    y = freq,
    group = female,
    color = female
  )) +
  geom_point(size = 2.5) +
  geom_line(size = 0.9) +
  scale_color_manual(values = cols_gender) +
  scale_x_discrete(expand = c(0, 0.3)) +
  scale_y_continuous(
    breaks = c(0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
    labels = percent,
    limits = c(0.0, 0.9),
    expand = c(0, 0)
  ) +
  theme(
    legend.position = c(0.5, 0.55),
    legend.title = element_blank(),
    legend.background = element_rect(fill = "grey95")
  ) +
  labs(y = 'Percentage', x = 'Year', title = 'TED talks by gender') +
  annotate(
    "text",
    label = "Male",
    x = '2007',
    y = 0.7,
    color = "#778899",
    size = 4.5
  ) +
  annotate(
    "text",
    label = "Female",
    x = '2007',
    y = 0.3,
    color = "#377eb8",
    size = 4.5
  ) + guides(color = FALSE)
```

### Ethnicity



```{r}
df_year_mode <- df %>%
  group_by(year, speaker_ethnicity) %>%
  summarise(n = n()) %>% 
  na.omit() %>% 
  mutate(freq = (n / sum(n))) %>%
  ungroup() 


cols <- c('White' = '#778899', 'Asian' = '#984ea3', 'Black' = '#4daf4a',
            'Hispanic' = '#377eb8', 'Other' = '#e41a1c')

ltypes <-
  c(
    'White' = 19,
    'Black' = 17,
    'Asian' = 18,
    'Hispanic' = 15,
    'Other' = 3
  )

p_ethnicity <-
  df_year_mode %>% ggplot(
    aes(
      x = year,
      y = freq,
      group = speaker_ethnicity,
      color = speaker_ethnicity,
      shape = speaker_ethnicity
    )
  ) +
  geom_point(size = 2.5, alpha = 0.9) +
  geom_line(size = 0.9, alpha = 0.9) +
  scale_color_manual(values = cols) +
  scale_x_discrete(expand = c(0, 0.2)) + scale_y_continuous(
    breaks = c(0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    labels = percent,
    limits = c(0.0, 0.9),
    expand = c(0, 0.01)
  ) +
  theme(
    legend.position = c(0.1, 0.5),
    legend.title = element_blank(),
    legend.background = element_rect(fill = "grey95")
  ) +
  labs(y = 'Percentage', x = 'Year', title = 'TED talks by ethnicity') +
  scale_shape_manual(values = ltypes)
```
 
 
### Combined (Figure 2) 
 
```{r fig.height=6, fig.width=9}
p_ethnicity <- p_ethnicity + theme(axis.text.x=element_text(angle=90, hjust=1),
                                   legend.position = c(0.2, 0.5)) +
  labs(y = 'Percentage of talks', x = 'Year', title = NULL)
p_gender <- p_gender + theme(axis.text.x=element_text(angle=90, hjust=1)) +
  labs(y = 'Percentage of talks', x = 'Year',title = NULL)
g <- cowplot::plot_grid(p_gender, p_ethnicity, labels = c('A', 'B'),
                        align = 'hv')
g
ggsave('figures/figure2.png',
       plot = g, dpi = 300, width = 9, height = 6)
ggsave('figures/figure2.pdf',
       plot = g,  width = 9, height = 6)
```

## Substantive Representation (Text Analysis)

### Create corpus

```{r}
ted <- corpus(df$text)
ted_dfm <- dfm(ted,
            stem = TRUE, tolower = TRUE,
            remove_twitter = FALSE, remove_punct = TRUE,
            remove_url = FALSE, remove_numbers =TRUE, verbose = TRUE,
            remove = stopwords('english'))
# feature trimming
ted_dfm <- dfm_trim(ted_dfm, max_docfreq = 0.50,min_docfreq = 0.01,
                    docfreq_type = 'prop')
dim(ted_dfm)
```

### Run structural topic models

Please note that the following block may take a considerable amount of time and, depending on your hardware, not all objects might fit in memory. In the replication files, you can find the file ``ted_ready_for_analysis.RData``, which contains all the objects you need to run the remainder of our analysis.

```{r, eval = FALSE}
out <-
  convert(ted_dfm, to = 'stm', docvars = df) # convert to dfm format

# running model with 20 topics
stm_20 <- stm(
  out$documents,
  out$vocab,
  data = out$meta,
  prevalence =  ~ s(date_num, degree = 2) + dummy_female * dummy_white,
  init.type = 'Spectral',
  K = 20,
  verbose = F
)

effect_stm_20 <-
  estimateEffect(1:20 ~ s(date_num, degree = 2) + dummy_female * dummy_white,
                 stm_20,
                 metadata = out$meta)

# running model with 30 topics
stm_30 <- stm(
  out$documents,
  out$vocab,
  data = out$meta,
  prevalence =  ~  s(date_num, degree = 2) + dummy_female + dummy_white,
  init.type = 'Spectral',
  K = 30,
  verbose = F
)

effect_stm_30 <-
  estimateEffect(1:30 ~ s(date_num, degree = 2) + dummy_female + dummy_white,
                 stm_30,
                 metadata = out$meta)

# running model with 50 topics
stm_50 <- stm(
  out$documents,
  out$vocab,
  data = out$meta,
  prevalence =  ~  s(date_num, degree = 2) + dummy_female + dummy_white,
  init.type = 'Spectral',
  K = 50,
  verbose = F
)

effect_stm_50 <-
  estimateEffect(
    1:50 ~ s(date_num, degree = 2) + dummy_female + dummy_white,
    stm_50,
    metadata = out$meta,
    uncertainty =  'None'
  )

# save objects for stminsights
save.image('ted_topic_models.RData')


# getting semantic coherence and exclusivity scores
quality <-
  get_diag(
    models = list(
      model_20 = stm_20,
      model_30 = stm_30,
      model_50 = stm_50
    ),
    outobj = out
  )
# plotting scores (figure S2)
quality %>%
  mutate(nr_topics = as.factor(nr_topics)) %>%
  ggplot(aes(
    x = coherence,
    y = exclusivity,
    color = statistic,
    shape = nr_topics
  ))  +
  geom_text_repel(aes(label = nr_topics),
                  size = 4.5,
                  show.legend = FALSE) + geom_point() +
  labs(
    x = 'Semantic Coherence',
    y = 'Exclusivity',
    shape = 'No. of topics',
    color = 'Statistic'
  ) + theme_light() +
  theme(
    legend.position = c(0.9, 0.7),
    legend.background = element_rect(color = "grey50")
  ) +
  scale_color_brewer(palette = 'Set1')

ggsave('figures/supporting_information_figure2.png',
      width = 8, height = 6, dpi = 300)


# run stminsights to validate topic models
library(stminsights)
run_stminsights()
```

After inspecting the different topic models, we found that the model with 30 topics includes the most useful topics for our analysis. We store objects for this model in a separate file.

```{r, eval = FALSE}
save(out, ted_dfm, stm_30, effect_stm_30, file = "ted_ready_for_analysis.RData")
save.image()
unlink("ted_ready_for_analysis.RData")
```


## Ted Talks - Sentiment


This code is used to calculate sentiment scores for more than 1,000,000 YouTube comments and might therefore talk quite some time to execute. Instead of running this code, you can load 'ted_yt-comments_sentiment.tsv' (see below), which contains a dataset with sentiment scores ready for analysis.

```{r, eval = FALSE}
yt <- read_tsv('ted_yt-comments.tsv')

# prepare texts for sentiment analysis
clean_text <- function(x) {
  x <- x %>% replace_internet_slang() %>% 
    replace_emoticon() %>% 
    replace_emoji_identifier()
  return(x)
}

# compute sentiment
yt$comment_processed <- clean_text(yt$comment_text)

sents <- yt %>%  select(yt_id, comment_id, comment_processed) %>% 
  get_sentences() 

out <- with(sents, sentiment_by(comment_processed, comment_id))
colnames(out) <- c('comment_id',
   'comment_sents_word_count', 'comment_avg_sentiment_sd', 'comment_avg_sentiment')

# merge datasets by comment id's
yt <- yt %>% left_join(out, by = 'comment_id')

# find examples with high negative sentiment
sent_neg <- yt %>% arrange(comment_avg_sentiment) %>% 
  select(comment_processed, comment_avg_sentiment, yt_id) %>%
  mutate(polarity = 'neg') %>% 
  head(50) 

# find examples with high positive sentiment
sent_pos <- yt %>% arrange(desc(comment_avg_sentiment)) %>% 
  select(comment_processed, comment_avg_sentiment, yt_id) %>%
  mutate(polarity = 'pos') %>% 
  head(50) 

# compute averages
yt_sent <- yt %>% group_by(yt_id) %>% summarize(
    yt_comments_avg_wordcount = mean(comment_sents_word_count),
    yt_comments_total_wordcount = sum(comment_sents_word_count),
    yt_comments_avg_sentiment = mean(comment_avg_sentiment))

#save sentiment scores 
write_tsv(yt_sent, 'ted_yt-comments_sentiment.tsv')
```


## Reload data for analysis

```{r message=FALSE, warning=FALSE}
# load main dataset
df <- read_tsv('ted_main_dataset.tsv')
df <- df %>% filter(!is.na(speaker_image_nr_faces)) 

# load topic models
load('ted_ready_for_analysis.RData')

# create several variables for analysis
df <- df %>% arrange(date)
df$date_num <- factor(df$date) %>% as.numeric()
df$year <-str_sub(df$date, 1, 4)
df$yt_views_l <- log(df$yt_views + 1)
df$yt_dislikes_l <- log(df$yt_dislikes + 1)
df$yt_comments_l <- log(df$yt_comments + 1)
df$yt_date_num <- df$yt_date %>% str_sub(1, 4) %>% as.integer()
df$factor_female <- as.factor(df$dummy_female)
df$factor_white <- as.factor(df$dummy_white)
levels(df$factor_female) <- c('male', 'female')
levels(df$factor_white)[levels(df$factor_white)== 1] <- 'white'
levels(df$factor_white)[levels(df$factor_white)== 0] <- 'non-white'
df$factor_white <- factor(df$factor_white, levels = c('white', 'non-white'))

# read sentiment scores
yt_sent <- read_tsv('~/Dropbox/kasus/PhD/projects/ted/data/yt_comments_sentiment.tsv')
df <- df %>% left_join(yt_sent, by = 'yt_id')
# normalize sentiment scores
df$sentiment <- df$yt_comments_avg_sentiment -  
  mean(df$yt_comments_avg_sentiment, na.rm = TRUE)

# extract stm topic proportions
props <- as.data.frame(stm_30$theta) # topic proportions
colnames(props) <- paste0('T_', as.character(1:ncol(props))) # topic names
props$maxprop <-  apply(props, 1, which.max)
df <- bind_cols(df, props)
# create identifier for inequality topic
df$max_t4 <- ifelse(df$maxprop == 4, 1, 0)
df$maxprop_f <- to_factor(df$maxprop, ref.lvl = 4)

# vector of topic labels
topic_labels <- 
  c(
    'work & misc.',
    'games & music',
    'stopwords',
    'inequality',
    'countries & poverty',
    'universe & physics',
    'family',
    'war & terror',
    'astronautics',
    'genetics',
    'architecture & design',
    'environment',
    'materials & sustainability',
    'law & politics',
    'art',
    'food & farming',
    'language & writing',
    'religion & mystery',
    'money & business',
    'neuroscience & robotics',
    'children & school',
    'computers & technology',
     'general science',
    'emotions & psychology',
    'data & internet',
    'transportation',
    'animals & nature',
    'body & sports',
    'medical',
    'ocean & sea' )

```


## Representation descriptives for 2016 & 2017 (Table 1)

```{r}
df %>% filter(year %in%  c('2016', '2017') & speaker_gender != 'undefined') %>% 
  count(speaker_gender) %>%
  mutate(rel = round(n / sum(n), 3))

df %>% filter(year %in%  c('2016', '2017') & speaker_gender != 'undefined') %>% 
  count(speaker_ethnicity) %>% 
  mutate(rel = round(n / sum(n), 3))
```



## Density of ethnicity annotations (Figure S1)

```{r fig.height=6, fig.width=8, message=FALSE, warning=FALSE}
eth <- df %>% select(speaker_face_asian,
                     speaker_face_black,
                     speaker_face_white,
                     speaker_face_hispanic,
                     speaker_face_other) %>%
  set_names(~ str_replace_all(., 'speaker_face_', '')) %>% 
  gather(key = ethnicity, value = prob) %>% 
  drop_na()
  
eth %>% ggplot(aes(x = prob, y = ethnicity, fill = ..x..)) + 
  geom_density_ridges_gradient() +  
  viridis::scale_fill_viridis(option = 'C') +
     theme_light(base_size = 13) + 
  theme(axis.text.y = element_text(vjust = 0), legend.position="none") +
   scale_x_continuous(expand = c(0.01, 0), labels = scales::percent) +
   scale_y_discrete(expand = c(0.01, 0)) +
  labs(x = 'Probability', y = 'Ethnicity')


ggsave('figures/supporting_information_figure1.png',
width = 8, height = 6, dpi = 300)
```


### Topic prevalence effects (Figure S3)

```{r fig.height=8, fig.width=8, message=FALSE, warning=FALSE}
pal <- c('#377eb8', '#e41a1c', '#4daf4a')

get_date <- function(x, startdate = "2006-06-01") {
  startdate <- ymd(startdate)
  x <- round(x, 0)
  return(startdate + months(x - 1))
}


effects_white <- get_effects(estimates = effect_stm_30,
                             variable = 'dummy_white',
                             type = 'pointestimate') %>%
  mutate(value = if_else(value == 0, 'Other', 'White')  %>% as.factor())


effects_female <- get_effects(estimates = effect_stm_30,
                              variable = 'dummy_female',
                              type = 'pointestimate') %>%
  mutate(value = if_else(value == 0, 'Male', 'Female')  %>% as.factor())

effects_time <- get_effects(estimates = effect_stm_30,
                            variable = 'date_num',
                            type = 'continuous') %>%
  mutate(date = get_date(value))



plot_female <- effects_female %>% filter(topic == 4)  %>%
  ggplot(aes(y = value, x = proportion, group = 1)) +
  geom_point(color = '#778899', size = 1.5) +
  geom_line(color = '#778899', size = 0.5) + guides(fill = F,
                                                    color = F,
                                                    group = F)  +
  geom_errorbarh(
    aes(xmin = lower, xmax = upper),
    height = 0.08,
    color = '#778899',
    size = 0.5
  ) + coord_flip()  +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank()
  )   +
  scale_x_continuous(labels = percent,
                     breaks = c(-0.01, 0.00, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06)) +
  labs(x = 'Topic proportion: inequality', y = 'Gender') #+


plot_white <- effects_white %>% filter(topic == 4)  %>%
  ggplot(aes(y = value, x = proportion, group = 1)) +
  geom_point(color = '#778899', size = 1.5) +
  geom_line(color = '#778899', size = 0.5) + guides(fill = F,
                                                    color = F,
                                                    group = F)  +
  geom_errorbarh(
    aes(xmin = lower, xmax = upper),
    height = 0.08,
    color = '#778899',
    size = 0.5
  ) + coord_flip()  +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank()
  )   +
  scale_x_continuous(labels = percent,
                     breaks = c(-0.01, 0.00, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06)) +
  labs(x = 'Topic proportion: inequality', y = 'Ethnicity') 

break.vec <-
  c(seq(
    from = as.Date("2006-06-01"),
    to = as.Date("2017-05-01"),
    by = "6 months"
  ),
  as.Date("2017-04-01"))

plot_time <- effects_time %>% filter(topic == 4)  %>%
  ggplot(aes(x = date, y = proportion)) +
  geom_line(color = '#778899') +
  geom_ribbon(aes(ymin = lower, ymax = upper),
              alpha = 0.2,
              fill = '#778899') +
  scale_x_date(
    breaks = break.vec,
    minor_break = break.vec,
    date_labels = "%Y/%m",
    expand = c(0.01, 0.0)
  ) +
  scale_y_continuous(breaks = pretty_breaks(n = 8),
                     expand = c(0.00, 0),
                     labels = percent) +
  guides(fill = F,
         color = F,
         group = F)  +
  labs(x = 'Date' , y  = 'Topic proportion: inequality') +
  scale_color_manual(values = pal) +
  scale_fill_manual(values = pal) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank()
  )



g1 <- cowplot::plot_grid(plot_white,
                         plot_female,
                         ncol = 2,
                         labels = c('A', 'B'))

g2 <- cowplot::plot_grid(g1, plot_time, ncol = 1,
                         labels = c('', 'C'))
g2

ggsave('figures/supporting_information_figure3.png', plot = g2,
          width = 8, height = 8, dpi = 300)

```



### Topic clusters and correlations (Figure 3)

```{r fig.height=6, fig.width=9, message=FALSE, warning=FALSE}
# hierarchical clustering
stm_dist <- CalcHellingerDist(stm_30$theta, by_rows = FALSE)
clusters <- hclust(as.dist(stm_dist), "ward.D2")
clusters$labels <- topic_labels

# create dendrogram
dendro <- ggdendrogram(clusters, rotate = TRUE, size = 50)  +
  theme(axis.text.y = element_text(size = 10),
        axis.title = element_text(size = 14, face = "bold")) +
  
  labs(title = NULL, x = 'Distance')   + scale_y_continuous(labels = NULL)

df$dummy_nonwhite <- rec(df$dummy_white, rec = 'rev')


# create correlation matrix
cormat <-
  cbind(stm_30$theta,
        df %>% select(dummy_female, dummy_nonwhite,
                      date_num)) %>%
  corr.test()
cormat <- cormat$r

pmat <-
  cbind(stm_30$theta,
        df %>% select(dummy_female, dummy_nonwhite,
                      date_num)) %>%
  corr.test()
pmat <- pmat$p


colnames(cormat) <- c(topic_labels, 'female', 'non-white', 'date')
rownames(cormat) <- c(topic_labels, 'female', 'non-white', 'date')

cormat <- cormat %>% melt()
pmat <- pmat %>% melt()
pmat$pstar <-
  case_when(
    is.na(pmat$value) ~ "",
    pmat$value < 0.001 ~ "***",
    pmat$value < 0.01 ~ "**",
    pmat$value < 0.05 ~ "*",
    TRUE ~ ""
  )
cormat <- bind_cols(cormat, pmat)
cormat <-
  cormat %>% filter(Var1 %in% c('female', 'non-white', 'date'))
cormat <-
  cormat %>% filter(!Var2 %in% c('female', 'non-white', 'date'))
cormat$Var2 <- as.character(cormat$Var2) %>%
  factor(levels = topic_labels[clusters$order], ordered = T)

cormat <- cormat %>%
  mutate(pstar  = if_else(
    value < 0,
    str_replace_all(pstar, '\\*', '-'),
    str_replace_all(pstar, '\\*', '+')
  ))

# create visualization
topcors <- ggplot(cormat, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), color = 'white') +
  geom_text(aes(label = pstar)) +
  scale_fill_gradient2(
    low = "#0571b0",
    mid = 'white',
    high = "#ca0020",
    midpoint = 0,
    limits = c(-0.3, 0.3),
    breaks = c(-0.3, -0.15, 0, 0.15, 0.3)
  ) +
  theme(
    axis.text.y = element_text(size = 10),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = 'right',
    legend.direction = 'vertical',
    plot.title = element_text(hjust = 1)
  )  +
  labs(y = NULL,
       x = NULL,
       fill = 'Correlation',
       title = NULL) +
  scale_x_discrete(
    expand = c(0, 00),
    labels = c('Gender:\nfemale', 'Ethnicity:\nnon-white',
               'Date')
  ) +
  theme(axis.text.y = element_text(face =
                                     c(rep('plain', 10), 'bold', rep('plain', 26))))


g <- cowplot::plot_grid(
  dendro,
  topcors,
  align = 'hv',
  labels = c('A', 'B'),
  hjust = 0.0
)
g
ggsave(
  'figures/figure3.png',
  g,
  width = 9,
  height = 6,
  dpi = 300
)
ggsave('figures/figure3.pdf', g,
       width = 9, height = 6)

```



## Tables for topic proportions and FREX terms (Supporting Information S2)

```{r, eval = FALSE}
get_props <- function(proportions) {
  names_ <- colnames(proportions)
  print(names_)
  frequency <-colMeans(proportions)
  order <- order(frequency, decreasing = FALSE)
  percentage <- frequency[order]
  names_ <- names_[order]
  topic <- factor(names_, levels=names_)
  combined <- data.frame(percentage, topic)
  combined <- transform(combined,
                        mid_y = ave(combined$percentage,
                                              combined$topic, 
                                              FUN = function(val) cumsum(val) - (0.5 * val)))
  combined$label <- topic_labels
  
  return(combined)
}

labels_30 <- labelTopics(stm_30, n= 20)$frex %>% t() %>% as.tibble()
labels_30 <- labels_30 %>% mutate_all(funs(str_c(., collapse = ', '))) %>% 
  slice(1) %>% gather(topic, frex_terms) %>% 
  mutate(topic = str_replace(topic, 'V', 'Topic '))
labels_30$label <- topic_labels 


props_30 <- get_props(props[, 1:30 ])

comb_30 <- left_join(labels_30, props_30, by = c('label')) %>% 
  mutate(proportion = round(percentage, 2)) %>% 
  select(label, proportion, frex_terms) %>% 
  arrange(desc(proportion))

# create data table
frex_table <- stargazer(comb_30,  summary = FALSE, rownames = FALSE)

# store as tex file
write_file(str_c(frex_table, collapse = '\n'),
           path = 'figures/ted_topicterms.tex' )

```

## Regressions


###  YouTube Sentiment (Figure 4)

```{r fig.height=8, fig.width=8, message=FALSE, warning=FALSE}
pal <- c('#377eb8', '#e41a1c', '#4daf4a')

df_reg <-
  df %>% dplyr::select(
    sentiment,
    yt_date_num,
    yt_views,
    factor_female,
    factor_white,
    starts_with('T_'),
    -T_1
  )

reg2 <- glm(sentiment ~ yt_date_num  + yt_views ,  data = df_reg)
reg3 <- glm(sentiment ~ yt_date_num + yt_views   +
              factor_female + factor_white,
            data = df_reg)
reg_sent <- glm(sentiment ~  . , data = df_reg)


sent_pred <- ggeffect(reg_sent, x.as.factor = TRUE)

# STM variables
se1 <- sent_pred$factor_female %>%
  ggplot(aes(y = x, x = predicted, group = 1)) +
  geom_point(color = '#778899', size = 1.5) +
  geom_line(color = '#778899', size = 0.5) +
  guides(fill = F,
         color = F,
         group = F)  +
  geom_errorbarh(
    aes(xmin = conf.low, xmax = conf.high),
    height = 0.08,
    color = '#778899',
    size = 0.5
  ) + coord_flip()  +
  theme(panel.grid.minor.x = element_blank()) +
  labs(y = 'Gender',  x = 'Comments sentiment', title = '') +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank()
  )   +
  scale_x_continuous(breaks = pretty_breaks(n = 5))


se2 <- sent_pred$factor_white %>%
  ggplot(aes(y = x, x = predicted, group = 1)) +
  geom_point(color = '#778899', size = 1.5) +
  geom_line(color = '#778899', size = 0.5) +
  guides(fill = F,
         color = F,
         group = F)  +
  geom_errorbarh(
    aes(xmin = conf.low, xmax = conf.high),
    height = 0.08,
    color = '#778899',
    size = 0.5
  ) + coord_flip()  +
  theme(panel.grid.minor.x = element_blank()) +
  labs(y = 'Ethnicity',  x = 'Comments sentiment', title = '') +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank()
  )   +
  scale_x_continuous(breaks = pretty_breaks(n = 5))

sent_pred <- ggeffect(reg_sent)
se3 <- sent_pred$T_4 %>%
  ggplot(aes(x = x, y = predicted)) +
  geom_line(color = '#778899') +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high),
              alpha = 0.2,
              fill = '#778899') +
  scale_x_continuous(
    breaks = pretty_breaks(n = 8),
    expand = c(0.00, 0),
    labels = percent,
    limits = c(0, 0.6)
  ) +
  scale_y_continuous(
    breaks = pretty_breaks(n = 7),
    expand = c(0.00, 0),
    limits = c(-0.15, NA)
  ) +
  
  guides(fill = F,
         color = F,
         group = F)  +
  labs(y = 'Comment sentiment' , x  = 'Topic proportion: inequality') +
  scale_color_manual(values = pal) +
  scale_fill_manual(values = pal) +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank()
  )


# combine plots
s_c2 <- cowplot::plot_grid(
  se1,
  se2,
  se3,
  ncol = 1,
  labels = c("B.1", "B.2", "B.3"),
  align = 'hv'
)


reg_sent_df <- get_model_data(reg_sent, sort.est = TRUE)

tlabels <-
  c(
    'Video date',
    'Video views',
    'Gender: female',
    'Ethnicity: non-white',
    str_c('Topic: ', topic_labels[2:30])
  )
names(tlabels) <-
  c(
    'yt_date_num',
    'yt_views',
    'factor_femalefemale',
    'factor_whitenon-white',
    str_c('T_', 2:30)
  )


# standardize coefficients
df_std <- df_reg %>% dplyr::select(-sentiment) %>%
  mutate_all(arm::rescale) %>% mutate(sentiment = df_reg$sentiment)
reg_sent_sd <- glm(sentiment ~  . , data = df_std)


reg_sent_df_sd <- get_model_data(reg_sent_sd, sort.est = TRUE)

tlabels <-
  c(
    'Video date',
    'Video views',
    'Gender: female',
    'Ethnicity: non-white',
    str_c('Topic: ', topic_labels[2:30])
  )
names(tlabels) <- c('yt_date_num',
                    'yt_views',
                    'factor_female',
                    'factor_white',
                    str_c('T_', 2:30))

# create plot
plot_sent_sd <-
  reg_sent_df_sd %>% ggplot(aes(
    x = estimate,
    y = term,
    color = group,
    shape = group
  )) +
  geom_point(size = 2) + scale_y_discrete(labels = tlabels) +
  scale_x_continuous(breaks = pretty_breaks(n = 5)) +
  geom_errorbarh(aes(
    xmin = conf.low,
    xmax = conf.high,
    height = 0
  )) +
  geom_vline(xintercept = 0, linetype = 2) +
  labs(x = 'Standardized coefficients',
       y = 'Covariate') +
  guides(color = FALSE, shape = FALSE) +
  scale_color_brewer(palette = 'Set1') +
  theme(axis.text.y = element_text(face =
                                     c(
                                       rep('plain', 2),
                                       'bold',
                                       rep('plain', 9),
                                       'bold',
                                       rep('plain', 19),
                                       'bold'
                                     )))

cowplot::plot_grid(plot_sent_sd + labs(y = NULL),
          s_c2,
          ncol = 2,
          labels = c('A', NULL))

ggsave('figures/figure4.png', width = 8, height = 8, dpi = 300)
ggsave('figures/figure4.pdf', width = 8, height = 8)



```


### Sentiment interaction (Figure S4)

```{r fig.height=6, fig.width=9, message=FALSE, warning=FALSE}
df_std2 <- df_reg %>% dplyr::select(-sentiment) %>%
  mutate(dummy_white = ifelse(factor_white == 'white', 0, 1),
         sentiment = df_reg$sentiment) %>% 
  dplyr::select(-factor_white)


reg_sent_sd2 <- glm(sentiment ~  . + dummy_white * yt_date_num , data = df_std2)

plot_model(reg_sent_sd2, type = 'pred', 
           terms = c('yt_date_num', 'dummy_white')) + 
  scale_x_continuous(breaks = pretty_breaks(n= 5)) +
  scale_color_discrete(name = 'Ethnicity', labels = c('White', 'Non-White')) +
  labs(x = 'Date', y = 'Sentiment', title = NULL)

ggsave('figures/supporting_information_figure4.png',
      width = 9, height = 6, dpi = 300)
```



### YouTube Dislikes (Figure S5)

```{r fig.height=8, fig.width=8, message=FALSE, warning=FALSE}
df_reg2 <-
  df %>% dplyr::select(
    yt_dislikes,
    yt_date_num,
    yt_views,
    factor_female,
    factor_white,
    starts_with('T_'),
    -T_1
  )

# dispersion test suggests to use negative binomial instead of poisson
# for dislike counts
AER::dispersiontest(glm(yt_dislikes ~  . , data = df_reg2, family = poisson),
                    trafo = 1)


dreg2 <-
  glm.nb(yt_dislikes ~ yt_date_num  + yt_views ,  data = df_reg2)

dreg3 <- glm.nb(yt_dislikes ~ yt_date_num + yt_views    +
                  factor_female + factor_white,
                data = df_reg2)

reg_dlikes <- glm.nb(yt_dislikes ~  . , data = df_reg2)



dlikes_pred <- ggeffect(reg_dlikes, x.as.factor = TRUE)


# STM variables
s1 <- dlikes_pred$factor_female %>%
  ggplot(aes(y = x, x = predicted, group = 1)) +
  geom_point(color = '#778899', size = 1.5) +
  geom_line(color = '#778899', size = 0.5) +
  guides(fill = F,
         color = F,
         group = F)  +
  geom_errorbarh(
    aes(xmin = conf.low, xmax = conf.high),
    height = 0.08,
    color = '#778899',
    size = 0.5
  ) + coord_flip()  +
  theme(panel.grid.minor.x = element_blank()) +
  labs(y = 'Gender',  x = 'Video dislikes', title = '') +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank()
  )   +
  scale_x_continuous(breaks = pretty_breaks(n = 5))

s2 <- dlikes_pred$factor_white %>%
  ggplot(aes(y = x, x = predicted, group = 1)) +
  geom_point(color = '#778899', size = 1.5) +
  geom_line(color = '#778899', size = 0.5) +
  guides(fill = F,
         color = F,
         group = F)  +
  geom_errorbarh(
    aes(xmin = conf.low, xmax = conf.high),
    height = 0.08,
    color = '#778899',
    size = 0.5
  ) + coord_flip()  +
  theme(panel.grid.minor.x = element_blank()) +
  labs(y = 'Ethnicity',  x = 'Video dislikes', title = '') +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank()
  )   +
  scale_x_continuous(breaks = pretty_breaks(n = 5))

dlikes_pred <- ggeffect(reg_dlikes)
s3 <- dlikes_pred$T_4 %>%
  ggplot(aes(x = x, y = predicted)) +
  geom_line(color = '#778899') +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high),
              alpha = 0.2,
              fill = '#778899')  +
  scale_x_continuous(
    breaks = pretty_breaks(n = 8),
    expand = c(0.00, 0),
    labels = percent,
    limits = c(0, 0.6)
  ) +
  scale_y_continuous(
    breaks = pretty_breaks(n = 7),
    expand = c(0.00, 0),
    limits = c(0, 1600)
  ) +
  
  guides(fill = F,
         color = F,
         group = F)  +
  labs(y = 'Video dislikes' , x  = 'Topic proportion: inequality') +
  scale_color_manual(values = pal) +
  scale_fill_manual(values = pal) +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank()
  )

s_c <- cowplot::plot_grid(
  s1,
  s2,
  s3,
  align = 'hv',
  ncol = 1,
  labels = c('B.1', 'B.2', 'B.3')
)


reg_dlikes_df <- get_model_data(reg_dlikes, sort.est = TRUE,
                                transform = NULL)

tlabels <-
  c(
    'Video date',
    'Video views',
    'Gender: female',
    'Ethnicity: non-white',
    str_c('Topic: ', topic_labels[2:30])
  )
names(tlabels) <-
  c(
    'yt_date_num',
    'yt_views',
    'factor_femalefemale',
    'factor_whitenon-white',
    str_c('T_', 2:30)
  )

plot_dlike <-
  reg_dlikes_df %>% ggplot(aes(x = estimate, y = term, color = group)) +
  geom_point(size = 2) + scale_y_discrete(labels = tlabels) +
  scale_x_continuous(breaks = pretty_breaks(n = 6)) +
  geom_errorbarh(aes(
    xmin = conf.low,
    xmax = conf.high,
    height = 0
  )) +
  geom_vline(xintercept = 0, linetype = 2) +
  labs(x = 'Coefficient: YouTube video dislikes', y = 'Covariate') +
  guides(color = FALSE) +
  scale_color_manual(values = c('#377eb8', '#e41a1c')) +
  theme(axis.text.y = element_text(face =
                                     c(
                                       rep('plain', 18),
                                       'bold',
                                       rep('plain', 7),
                                       'bold',
                                       rep('plain', 5),
                                       'bold'
                                     )))

# standardize coefficients
df_std2 <- df_reg2 %>% dplyr::select(-yt_dislikes) %>%
  mutate_all(arm::rescale) %>% mutate(yt_dislikes = df_reg2$yt_dislikes)
reg_dlikes_sd <- glm.nb(yt_dislikes ~  . , data = df_std2)



reg_dlikes_df_sd <- get_model_data(reg_dlikes_sd, sort.est = TRUE,
                                   transform = NULL)

tlabels <-
  c(
    'Video date',
    'Video views',
    'Gender: female',
    'Ethnicity: non-white',
    str_c('Topic: ', topic_labels[2:30])
  )
names(tlabels) <- c('yt_date_num',
                    'yt_views',
                    'factor_female',
                    'factor_white',
                    str_c('T_', 2:30))


# create visualization
plot_dlikes_sd <-
  reg_dlikes_df_sd %>% ggplot(aes(
    x = estimate,
    y = term,
    color = group,
    shape = group
  )) +
  geom_point(size = 2) + scale_y_discrete(labels = tlabels) +
  scale_x_continuous(breaks = pretty_breaks(n = 7)) +
  geom_errorbarh(aes(
    xmin = conf.low,
    xmax = conf.high,
    height = 0
  )) +
  geom_vline(xintercept = 0, linetype = 2) +
  labs(x = 'Standardized coefficients',
       y = 'Covariate') +
  guides(color = FALSE, shape = FALSE) +
  scale_color_manual(values = c('#377eb8', '#e41a1c')) +
  theme(axis.text.y = element_text(face =
                                     c(
                                       rep('plain', 10),
                                       'bold',
                                       rep('plain', 17),
                                       'bold',
                                       rep('plain', 1),
                                       'bold',
                                       rep('plain', 2)
                                     )))


cowplot::plot_grid(plot_dlikes_sd + labs(y = NULL),
                   s_c,
                   ncol = 2,
                   labels = c('A', NULL))


ggsave('figures/supporting_information_figure5.png', 
       width = 8, height = 8, dpi = 300)
```

### Regression tables (S3)

```{r, eval = FALSE}
texreg(
  list(reg_sent, reg_dlikes),
  dcolumn = TRUE,
  booktabs = TRUE,
  digits = 2,
  custom.model.names = c('Comment sentiment', 'Video dislikes'),
  custom.coef.names = c('Intercept',
                        tlabels %>% str_replace('&', 'and')),
  single.row = TRUE,
  use.packages = FALSE,
  label = "tab:2",
  caption = "Regression models for comment sentiment and video dislikes of TED talks on YouTube",
  float.pos = "H",
  file = 'figures/regression_tables.tex'
)       

```

## Supporting Information S1 - Validation of image recognition annotations

```{r}
# read file for human codings and image recognition annotations
comb <- read_tsv('ted_talks_validation.tsv')
head(comb)
```

[Cohen's Kappa](https://en.wikipedia.org/wiki/Cohen%27s_kappa) will be used to determine the interrater agreement between human coders and the [Kairos face recognition algorithm](https://www.kairos.com/docs/api/).


### Pairwise Agreement: Ethnicity

```{r}
eth_kappa_1 <- comb %>% select(speaker_ethnicity1, speaker_ethnicity) %>% 
  kappam.fleiss(detail = TRUE)
eth_kappa_2 <- comb %>% select(speaker_ethnicity2, speaker_ethnicity) %>% 
  kappam.fleiss(detail = TRUE)
eth_kappa_3 <- comb %>% select(speaker_ethnicity2, speaker_ethnicity1) %>% 
  kappam.fleiss(detail = TRUE)

eth_kappa_1  # rater 1 / algorithm

eth_kappa_2  # rater 2 / algorithm

eth_kappa_3  # rater 1 / rater 2

# raters / machine average
mean(c(eth_kappa_1$value, eth_kappa_2$value))
```

### Pairwise Agreement: Gender

```{r}
gender_kappa_1 <- comb %>% select(speaker_face_gender1, speaker_face_gender) %>%
  kappam.fleiss(detail = TRUE)
gender_kappa_2 <- comb %>% select(speaker_face_gender2, speaker_face_gender) %>%
  kappam.fleiss(detail = TRUE)
gender_kappa_3 <- comb %>% select(speaker_face_gender2, speaker_face_gender1) %>%
  kappam.fleiss(detail = TRUE)

gender_kappa_1 # rater 1 / algorithm

gender_kappa_2 # rater 2 / algorithm

gender_kappa_3 # rater 1 / rater 2

# raters / machine average
mean(c(gender_kappa_1$value, gender_kappa_2$value))

```

### Overall Agreement: Ethnicity

```{r}
eth_all <- comb %>% select(contains('speaker_ethnicity')) %>% 
  kappam.fleiss(exact = FALSE, detail = TRUE)
eth_all
```


### Overall Agreement: Gender

```{r}
sex_all <- comb %>% select(contains('speaker_face')) %>% 
  kappam.fleiss(exact = FALSE, detail = TRUE)
sex_all
```



### Disagreements

#### Rater 1

```{r}
comb %>% select(speaker, speaker_face_gender, speaker_face_gender1,
                speaker_ethnicity, speaker_ethnicity1 ) %>% 
  filter(
            speaker_ethnicity != speaker_ethnicity1)  %>%
  arrange(speaker) 
```

#### Rater 2

```{r}
comb %>% select(speaker, speaker_face_gender, speaker_face_gender2,
                speaker_ethnicity, speaker_ethnicity2) %>% 
  filter(
            speaker_ethnicity != speaker_ethnicity2) %>% 
  arrange(speaker) 
```

#### Examples

```{r}
comb %>% filter(speaker %in%
c('Danielle Feinberg', 'David Pogue',
  'Sonaar Luthra', 'Sunitha Krishnan' )) %>% 
  select(sort(current_vars()))  
```

### Calculate Accuracy and F1 Scores


#### Ethnicity

Calculating precision, recall and F1 scores for multiclass applications (such as ethnicity in our case) is more complicated then for binary prediction tasks. See [this source](https://www.datascienceblog.net/post/machine-learning/performance-measures-multi-class-problems/) for a thorough explanation. Most of the code below is based upon the source.

```{r}
get.conf.stats <- function(cm) {
    out <- vector("list", length(cm))
    for (i in seq_along(cm)) {
        x <- cm[[i]]
        tp <- x$table[x$positive, x$positive] 
        fp <- sum(x$table[x$positive, colnames(x$table) != x$positive])
        fn <- sum(x$table[colnames(x$table) != x$positie, x$positive])
        elem <- c(tp = tp, fp = fp, fn = fn)
        out[[i]] <- elem
    }
    df <- do.call(rbind, out)
    rownames(df) <- unlist(lapply(cm, function(x) x$positive))
    return(as.data.frame(df))
}
get.micro.f1 <- function(cm) {
    cm.summary <- get.conf.stats(cm)
    tp <- sum(cm.summary$tp)
    fn <- sum(cm.summary$fn)
    fp <- sum(cm.summary$fp)
    pr <- tp / (tp + fp)
    re <- tp / (tp + fn)
    f1 <- 2 * ((pr * re) / (pr + re))
    return(f1)
}

get.macro.f1 <- function(cm) {
    c <- cm[[1]]$byClass # a single matrix is sufficient
    re <- sum(c[, "Recall"]) / nrow(c)
    pr <- sum(c[, "Precision"]) / nrow(c)
    f1 <- 2 * ((re * pr) / (re + pr))
    return(f1)
}

human_eth <- as.factor(comb$speaker_ethnicity1)
algo_eth <- as.factor(comb$speaker_ethnicity)
```

These are the metrics for each class:

```{r}
cm <- vector("list", length(levels(human_eth)))
for (i in seq_along(cm)) {
    positive.class <- levels(human_eth)[i]

    cm[[i]] <- confusionMatrix(human_eth, algo_eth, 
                               positive = positive.class)
}
metrics <- c("Precision", "Recall", "F1")

print(cm[[1]]$byClass[, metrics])

```

This is the overall micro averaged f1 score, which does not take imbalanced class distribution into account:

```{r}
micro.f1 <- get.micro.f1(cm)
print(paste0("Micro F1 is: ", round(micro.f1, 2)))

```

And this is the macro averaged f1 scores, which takes performance for each individual class into account:

```{r}
macro.f1 <- get.macro.f1(cm)
print(paste0("Macro F1 is: ", round(macro.f1, 2)))
```

#### Gender

For the binary gender category the metrics for precision, recall and F1 fall between 0.95 and 0.97:

```{r}
human_gender <- as.factor(comb$speaker_face_gender1)
algo_gender <- as.factor(comb$speaker_face_gender)

print(confusionMatrix(human_gender, algo_gender,
                      positive = 'female', mode = 'everything'))
```


### Predicting gender and racial categories with *wru*

```{r}
# extracting surnames
comb <- comb %>% mutate(
  surname = gsub("^.* ", "", speaker),
  first_name = str_split(speaker, " ") %>% map_chr(1))
comb %>% select(first_name, surname, speaker)
# extracting firstnames
```

#### Getting gender estimates

```{r}
name_gender <- findGivenNames(unique(comb$first_name),
               textPrepare = FALSE,  progress = FALSE) 
name_gender <- name_gender %>% 
                rename(first_name = name, gender_pred_name = gender) %>% 
                select(first_name, gender_pred_name)

comb <- comb %>% left_join(name_gender, by = 'first_name')
```

#### Getting ethnicity estimates

Based on [this paper](https://doi.org/10.1093/pan/mpw001):

```{r}
name_eth <- predict_race(voter.file = comb, surname.only = TRUE)
name_pred <- name_eth %>% select(starts_with('pred'))
name_pred_max <- colnames(name_pred)[max.col(name_pred ,ties.method="first")]
```



```{r}
comb$speaker_ethnicity_surname <-  case_when (
                 name_pred_max == 'pred.whi' ~ 'White',
                 name_pred_max == 'pred.bla' ~ 'Black',
                  name_pred_max == 'pred.asi' ~ 'Asian',
                  name_pred_max == 'pred.his' ~ 'Hispanic',
                 TRUE ~ 'Other'
                )
comb %>% count(speaker_ethnicity_surname)
```


### Comparison: Name-based vs human annotations

We can now repeat the process from above by comparing name-based predictions with the human annotations.

For ethnicity, the F1 scores are substantially higher in comparison to the image recognition algorithm:

```{r}

name_eth <- comb$speaker_ethnicity_surname %>% as.factor()

cm <- vector("list", length(levels(human_eth)))
for (i in seq_along(cm)) {
    positive.class <- levels(human_eth)[i]

    cm[[i]] <- confusionMatrix(human_eth, name_eth, 
                               positive = positive.class)
}
metrics <- c("Precision", "Recall", "F1")

print(cm[[1]]$byClass[, metrics])

micro.f1 <- get.micro.f1(cm)
print(paste0("Micro F1 is: ", round(micro.f1, 2)))

macro.f1 <- get.macro.f1(cm)
print(paste0("Macro F1 is: ", round(macro.f1, 2)))
```

Regarding gender, the performance is much closer to the image recognition algorithm, although still slightly worse:

```{r}
human_gender <- as.factor(comb$speaker_face_gender1)
name_gender <- as.factor(comb$gender_pred_name)

print(confusionMatrix(human_gender, name_gender,
                      positive = 'female', mode = 'everything'))
```

## R environment info

The following cell contains info about the R environment that was used for the analysis:

```{r}
sessionInfo()
```

